{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 特征变换"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 对指化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 2.71828183,  7.3890561 , 20.08553692, 54.59815003])"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.log([1,2,3,4])\n",
    "np.exp([1,2,3,4])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 离散化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[low, low, low, medium, medium, medium, high, high, high]\n",
       "Categories (3, object): [low < medium < high]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lst = [6, 8, 10, 15, 16, 24, 25, 40, 67]\n",
    "# 等深分箱 平均处理数据长度\n",
    "pd.qcut(lst, q=3, labels=['low', 'medium', 'high'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[low, low, low, low, low, low, low, medium, high]\n",
       "Categories (3, object): [low < medium < high]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 等宽分箱 平均处理数据大小\n",
    "pd.cut(lst, bins=3, labels=['low', 'medium', 'high'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 数值化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/f/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/label.py:235: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
      "  y = column_or_1d(y, warn=True)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([1, 2, 0, 1, 0])"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import LabelEncoder, OneHotEncoder\n",
    "# LabelEncoder 默认按文本排序编码\n",
    "lb_encoder = LabelEncoder()\n",
    "lb_encoder.fit_transform(np.array(['low', 'medium', 'high', 'low', 'high']).reshape(-1, 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0., 0., 1.],\n",
       "       [1., 0., 0.],\n",
       "       [0., 1., 0.]])"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# OneHotEncoder可以特征扩维\n",
    "oh_encoder = OneHotEncoder()\n",
    "oh_encoder.fit_transform(np.array(['Red', 'Blue', 'Green']).reshape(-1, 1)).toarray()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 正规化"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "分为 L1 和 L2 两种，将特征间差距转化成特征长度的一个相对值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import Normalizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.125,  0.125,  0.375, -0.125,  0.25 ]])"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Normalizer(norm='l1').fit_transform(np.array([[1, 1, 3, -1, 2]]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 归一化"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "数据按比例统一成0-1之间的形式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/f/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:595: DataConversionWarning: Data with input dtype int64 was converted to float64 by MinMaxScaler.\n",
      "  warnings.warn(msg, DataConversionWarning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([[0.  ],\n",
       "       [0.15],\n",
       "       [0.45],\n",
       "       [0.7 ],\n",
       "       [1.  ]])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
    "MinMaxScaler().fit_transform(np.array([1,4,10,15,21]).reshape(-1, 1))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 标准化"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "把数据转化为标准差为1, 用与展现一个数据同特征下与其他数据的相对大小差距关系"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/f/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:595: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.\n",
      "  warnings.warn(msg, DataConversionWarning)\n",
      "/home/f/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:595: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.\n",
      "  warnings.warn(msg, DataConversionWarning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([[ 1.],\n",
       "       [ 1.],\n",
       "       [ 1.],\n",
       "       [ 1.],\n",
       "       [-1.],\n",
       "       [-1.],\n",
       "       [-1.],\n",
       "       [-1.]])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "StandardScaler().fit_transform(np.array([1,1,1,1,0,0,0,0]).reshape(-1, 1))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 特征选择"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>D</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.856963</td>\n",
       "      <td>-1.018157</td>\n",
       "      <td>0.786064</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-1.425053</td>\n",
       "      <td>-1.580649</td>\n",
       "      <td>-0.802966</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2.184314</td>\n",
       "      <td>-0.064868</td>\n",
       "      <td>-1.763257</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.576157</td>\n",
       "      <td>-0.781799</td>\n",
       "      <td>0.057625</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3.107851</td>\n",
       "      <td>0.588208</td>\n",
       "      <td>0.310231</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.908251</td>\n",
       "      <td>0.586399</td>\n",
       "      <td>0.740963</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1.218341</td>\n",
       "      <td>-0.132951</td>\n",
       "      <td>-1.546310</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>-1.206157</td>\n",
       "      <td>0.076506</td>\n",
       "      <td>0.892009</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>-0.499073</td>\n",
       "      <td>-1.662156</td>\n",
       "      <td>-0.970675</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1.572013</td>\n",
       "      <td>0.197386</td>\n",
       "      <td>1.675919</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          A         B         C  D\n",
       "0  0.856963 -1.018157  0.786064  1\n",
       "1 -1.425053 -1.580649 -0.802966  0\n",
       "2  2.184314 -0.064868 -1.763257  0\n",
       "3 -0.576157 -0.781799  0.057625  1\n",
       "4  3.107851  0.588208  0.310231  1\n",
       "5  0.908251  0.586399  0.740963  1\n",
       "6  1.218341 -0.132951 -1.546310  1\n",
       "7 -1.206157  0.076506  0.892009  1\n",
       "8 -0.499073 -1.662156 -0.970675  1\n",
       "9  1.572013  0.197386  1.675919  0"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import scipy.stats as ss\n",
    "df = pd.DataFrame({'A':ss.norm.rvs(size=10), \n",
    "                   'B':ss.norm.rvs(size=10), \n",
    "                   'C':ss.norm.rvs(size=10), \n",
    "                   'D':np.random.randint(0, 2, size=10)}) # D为标注\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.svm import SVR  # 回归器\n",
    "from sklearn.tree import DecisionTreeRegressor # 决策树回归器\n",
    "# 分别为过滤思想， 包裹思想， 嵌入思想\n",
    "from sklearn.feature_selection import SelectKBest, RFE, SelectFromModel\n",
    "X = df.loc[:, ['A', 'B', 'C']]\n",
    "Y = df.loc[:, 'D']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-1.01815684,  0.78606404],\n",
       "       [-1.58064936, -0.80296574],\n",
       "       [-0.06486789, -1.76325705],\n",
       "       [-0.78179874,  0.05762494],\n",
       "       [ 0.58820762,  0.31023128],\n",
       "       [ 0.58639871,  0.74096339],\n",
       "       [-0.13295112, -1.54631015],\n",
       "       [ 0.07650625,  0.89200924],\n",
       "       [-1.66215577, -0.97067477],\n",
       "       [ 0.19738566,  1.67591949]])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 过滤思想处理\n",
    "skb = SelectKBest(k=2) # 最终留下2个特征\n",
    "skb.fit(X, Y)\n",
    "skb.transform(X)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "此步骤过滤掉A列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.85696295, -1.01815684],\n",
       "       [-1.4250526 , -1.58064936],\n",
       "       [ 2.18431358, -0.06486789],\n",
       "       [-0.57615749, -0.78179874],\n",
       "       [ 3.10785065,  0.58820762],\n",
       "       [ 0.90825144,  0.58639871],\n",
       "       [ 1.21834142, -0.13295112],\n",
       "       [-1.20615703,  0.07650625],\n",
       "       [-0.4990735 , -1.66215577],\n",
       "       [ 1.57201295,  0.19738566]])"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 包裹思想处理\n",
    "rfe = RFE(estimator=SVR(kernel=\"linear\"), n_features_to_select=2, step=1)\n",
    "rfe.fit_transform(X, Y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "此步骤过滤掉C列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.85696295, -1.01815684],\n",
       "       [-1.4250526 , -1.58064936],\n",
       "       [ 2.18431358, -0.06486789],\n",
       "       [-0.57615749, -0.78179874],\n",
       "       [ 3.10785065,  0.58820762],\n",
       "       [ 0.90825144,  0.58639871],\n",
       "       [ 1.21834142, -0.13295112],\n",
       "       [-1.20615703,  0.07650625],\n",
       "       [-0.4990735 , -1.66215577],\n",
       "       [ 1.57201295,  0.19738566]])"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 嵌入思想处理\n",
    "sfm = SelectFromModel(estimator=DecisionTreeRegressor(), threshold=0.01) # 重要因子\n",
    "sfm.fit_transform(X, Y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "重要因子大小影响留下的列数"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 特征降维"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "常用 PCA 和 LDA 降维"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-1.73205081],\n",
       "       [-1.73205081],\n",
       "       [-3.46410162],\n",
       "       [ 1.73205081],\n",
       "       [ 1.73205081],\n",
       "       [ 3.46410162]])"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
    "# LDA降维\n",
    "X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])\n",
    "Y = np.array([1, 1, 1, 2, 2, 2])\n",
    "LinearDiscriminantAnalysis(n_components=1).fit_transform(X, Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([2])"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 也可以当作分类器\n",
    "fisher_classifier = LinearDiscriminantAnalysis(n_components=1).fit(X, Y)\n",
    "fisher_classifier.predict([[0.8, 1]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 1.38340578],\n",
       "       [ 2.22189802],\n",
       "       [ 3.6053038 ],\n",
       "       [-1.38340578],\n",
       "       [-2.22189802],\n",
       "       [-3.6053038 ]])"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.decomposition import PCA\n",
    "# PCA降维\n",
    "lower_dim = PCA(n_components=1)\n",
    "lower_dim.fit_transform(X)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 基于HR.csv做下练习"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "具体HR.csv数据全部的操作出门左拐看kaggle目录"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "def hr_preprocessing(sl=False, le=False, npr=False, amh=False, tsc=False, wa=False, \n",
    "                     pl=False, dep=False, sal=False,lower_d=False, ld_n=1):\n",
    "    def map_salary(s):\n",
    "        d = dict([('low', 0), ('medium', 1), ('high', 2)])\n",
    "        return d.get(s, 0)\n",
    "    \n",
    "    df = pd.read_csv(\"HR.csv\")\n",
    "    \n",
    "    # 清洗数据\n",
    "    df = df.dropna(subset=['satisfaction_level', 'last_evaluation'])\n",
    "    df = df[df['satisfaction_level']<=1][df['salary']!='nme']\n",
    "    # 标注\n",
    "    label = df['left']\n",
    "    df=df.drop('left', axis=1)\n",
    "    # 特征选择\n",
    "    # 特征处理\n",
    "    scaler_lst = [sl, le, npr, amh, tsc, wa, pl]\n",
    "    column_lst = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_monthly_hours', \n",
    "                  'time_spend_company', 'Work_accident', 'promotion_last_5years']\n",
    "    for i in range(len(scaler_lst)):\n",
    "        if not scaler_lst[i]:\n",
    "            df[column_lst[i]] = MinMaxScaler().fit_transform(df[column_lst[i]].values.reshape(-1, 1)).reshape(1, -1)[0]\n",
    "        else:\n",
    "            df[column_lst[i]] = StandardScaler().fit_transform(df[column_lst[i]].values.reshape(-1, 1)).reshape(1, -1)[0]\n",
    "    # 对离散值处理\n",
    "    scaler_lst2 = [sal, dep]\n",
    "    column_lst2 = ['salary', 'department']\n",
    "    for i in range(len(scaler_lst2)):\n",
    "        if not scaler_lst2[i]:\n",
    "            if column_lst2[i] == 'salary':\n",
    "                df[column_lst2[i]] = [map_salary(s) for s in df['salary'].values]\n",
    "            else:\n",
    "                df[column_lst2[i]] = LabelEncoder().fit_transform(df[column_lst2[i]].values.reshape(-1, 1))\n",
    "            # 统一归一化处理\n",
    "            df[column_lst2[i]] = MinMaxScaler().fit_transform(df[column_lst2[i]].values.reshape(-1, 1))\n",
    "        else:\n",
    "            df = pd.get_dummies(df, columns=[column_lst2[i]])\n",
    "    if lower_d:\n",
    "        # 如果为True，使用PCA降维\n",
    "        return PCA(n_components=ld_n).fit_transform(df.values), label\n",
    "    return df, label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(       satisfaction_level  last_evaluation  number_project  \\\n",
      "0               -0.936495        -1.087275       -1.462863   \n",
      "1                0.752814         0.840707        0.971113   \n",
      "2               -2.022479         0.957554        2.593763   \n",
      "3                0.431041         0.899131        0.971113   \n",
      "4               -0.976716        -1.145699       -1.462863   \n",
      "5               -0.815830        -1.262546       -1.462863   \n",
      "6               -2.062701         0.314894        1.782438   \n",
      "7                1.235474         0.782283        0.971113   \n",
      "8                1.114809         1.658639        0.971113   \n",
      "9               -0.775608        -1.087275       -1.462863   \n",
      "10              -0.654943        -1.028852       -1.462863   \n",
      "11              -2.022479         0.548588        1.782438   \n",
      "12               0.913701         1.191249        0.159788   \n",
      "13              -0.815830        -0.970428       -1.462863   \n",
      "14              -1.016938        -0.912004       -1.462863   \n",
      "15              -0.936495        -1.028852       -1.462863   \n",
      "16              -0.654943        -1.437818       -1.462863   \n",
      "17               0.672371         1.600215        0.159788   \n",
      "18              -0.654943        -1.204123       -1.462863   \n",
      "19               0.591928         1.015978        0.971113   \n",
      "20              -2.022479         0.665436        1.782438   \n",
      "21              -0.936495        -0.970428       -1.462863   \n",
      "22              -2.102922         1.366520        1.782438   \n",
      "23              -0.614722        -0.853580       -1.462863   \n",
      "24              -0.856051        -1.087275       -1.462863   \n",
      "25               1.114809         1.191249        0.971113   \n",
      "26               0.833258         0.899131        0.159788   \n",
      "27              -0.856051        -1.320970       -1.462863   \n",
      "28              -0.815830        -1.496241       -1.462863   \n",
      "29              -0.936495        -1.262546       -1.462863   \n",
      "...                   ...              ...             ...   \n",
      "14969           -0.735386        -1.496241       -1.462863   \n",
      "14970            0.672371         1.249673        0.159788   \n",
      "14971           -0.896273        -1.554665       -1.462863   \n",
      "14972           -2.022479         1.483368        1.782438   \n",
      "14973           -1.016938        -1.145699       -1.462863   \n",
      "14974           -1.016938        -1.028852       -1.462863   \n",
      "14975           -2.062701         0.431741        2.593763   \n",
      "14976           -0.856051        -1.437818       -1.462863   \n",
      "14977            0.793036         0.782283        0.159788   \n",
      "14978           -0.856051        -1.437818       -1.462863   \n",
      "14979           -2.102922         1.249673        1.782438   \n",
      "14980            0.591928         1.015978        0.971113   \n",
      "14981            0.471263         1.249673        0.971113   \n",
      "14982           -0.936495        -1.320970       -1.462863   \n",
      "14983            0.431041         0.723860        0.971113   \n",
      "14984           -0.856051        -0.912004       -1.462863   \n",
      "14985            1.195252         1.600215        0.971113   \n",
      "14986            0.953923         0.782283        0.159788   \n",
      "14987            1.155031        -0.094072        0.971113   \n",
      "14988           -0.614722        -0.970428       -1.462863   \n",
      "14989           -0.735386        -0.853580       -1.462863   \n",
      "14990            1.114809         0.957554        0.971113   \n",
      "14991           -2.102922         0.548588        1.782438   \n",
      "14992           -0.856051        -1.379394       -1.462863   \n",
      "14993            0.591928         0.665436        1.782438   \n",
      "14994           -0.856051        -0.853580       -1.462863   \n",
      "14995           -0.976716        -1.379394       -1.462863   \n",
      "14996           -0.976716        -1.087275       -1.462863   \n",
      "14997           -2.022479         1.424944        1.782438   \n",
      "14998           -0.976716        -1.145699       -1.462863   \n",
      "\n",
      "       average_monthly_hours  time_spend_company  Work_accident  \\\n",
      "0                  -0.882040               0.125            0.0   \n",
      "1                   1.220423               0.500            0.0   \n",
      "2                   1.420657               0.250            0.0   \n",
      "3                   0.439508               0.375            0.0   \n",
      "4                  -0.841993               0.125            0.0   \n",
      "5                  -0.962134               0.125            0.0   \n",
      "6                   0.920071               0.250            0.0   \n",
      "7                   1.160352               0.375            0.0   \n",
      "8                   0.459532               0.375            0.0   \n",
      "9                  -1.182392               0.125            0.0   \n",
      "10                 -1.322556               0.125            0.0   \n",
      "11                  2.081431               0.250            0.0   \n",
      "12                  0.659766               0.375            0.0   \n",
      "13                 -1.062251               0.125            0.0   \n",
      "14                 -1.282509               0.125            0.0   \n",
      "15                 -1.162368               0.125            0.0   \n",
      "16                 -0.821970               0.125            0.0   \n",
      "17                  1.080259               0.500            0.0   \n",
      "18                 -0.821970               0.125            1.0   \n",
      "19                  1.220423               0.375            0.0   \n",
      "20                  1.620892               0.250            0.0   \n",
      "21                 -1.082274               0.125            0.0   \n",
      "22                  2.061408               0.250            0.0   \n",
      "23                 -1.242462               0.125            0.0   \n",
      "24                 -0.862016               0.125            0.0   \n",
      "25                  0.819954               0.375            0.0   \n",
      "26                  0.759883               0.375            0.0   \n",
      "27                 -1.322556               0.125            0.0   \n",
      "28                 -1.462720               0.125            0.0   \n",
      "29                 -1.382626               0.125            0.0   \n",
      "...                      ...                 ...            ...   \n",
      "14969              -0.882040               0.125            0.0   \n",
      "14970               0.479555               0.375            0.0   \n",
      "14971              -1.222439               0.125            0.0   \n",
      "14972               2.181549               0.250            0.0   \n",
      "14973              -1.162368               0.125            0.0   \n",
      "14974              -0.962134               0.125            0.0   \n",
      "14975               2.181549               0.250            0.0   \n",
      "14976              -1.302532               0.125            0.0   \n",
      "14977               1.000165               0.500            0.0   \n",
      "14978              -1.142345               0.125            0.0   \n",
      "14979               1.901220               0.250            0.0   \n",
      "14980               0.739860               0.375            0.0   \n",
      "14981              -0.781923               0.250            0.0   \n",
      "14982              -1.282509               0.125            0.0   \n",
      "14983               1.120305               0.375            0.0   \n",
      "14984              -1.062251               0.125            0.0   \n",
      "14985               1.060235               0.375            0.0   \n",
      "14986               0.920071               0.500            0.0   \n",
      "14987               0.099109               0.250            0.0   \n",
      "14988              -1.122321               0.125            0.0   \n",
      "14989              -0.841993               0.125            1.0   \n",
      "14990               0.539625               0.375            1.0   \n",
      "14991               1.120305               0.250            0.0   \n",
      "14992              -0.922087               0.125            0.0   \n",
      "14993               1.841150               0.500            0.0   \n",
      "14994              -1.002181               0.125            0.0   \n",
      "14995              -0.821970               0.125            0.0   \n",
      "14996              -1.162368               0.125            0.0   \n",
      "14997               1.580845               0.250            0.0   \n",
      "14998              -0.862016               0.125            0.0   \n",
      "\n",
      "       promotion_last_5years  department  salary  \n",
      "0                        0.0    0.777778     0.0  \n",
      "1                        0.0    0.777778     0.5  \n",
      "2                        0.0    0.777778     0.5  \n",
      "3                        0.0    0.777778     0.0  \n",
      "4                        0.0    0.777778     0.0  \n",
      "5                        0.0    0.777778     0.0  \n",
      "6                        0.0    0.777778     0.0  \n",
      "7                        0.0    0.777778     0.0  \n",
      "8                        0.0    0.777778     0.0  \n",
      "9                        0.0    0.777778     0.0  \n",
      "10                       0.0    0.777778     0.0  \n",
      "11                       0.0    0.777778     0.0  \n",
      "12                       0.0    0.777778     0.0  \n",
      "13                       0.0    0.777778     0.0  \n",
      "14                       0.0    0.777778     0.0  \n",
      "15                       0.0    0.777778     0.0  \n",
      "16                       0.0    0.777778     0.0  \n",
      "17                       0.0    0.777778     0.0  \n",
      "18                       1.0    0.777778     0.0  \n",
      "19                       0.0    0.777778     0.0  \n",
      "20                       0.0    0.777778     0.0  \n",
      "21                       0.0    0.777778     0.0  \n",
      "22                       0.0    0.777778     0.0  \n",
      "23                       0.0    0.777778     0.0  \n",
      "24                       0.0    0.777778     0.0  \n",
      "25                       0.0    0.777778     0.0  \n",
      "26                       0.0    0.777778     0.0  \n",
      "27                       0.0    0.777778     0.0  \n",
      "28                       0.0    0.222222     0.0  \n",
      "29                       0.0    0.222222     0.0  \n",
      "...                      ...         ...     ...  \n",
      "14969                    0.0    0.777778     0.5  \n",
      "14970                    0.0    0.777778     0.5  \n",
      "14971                    0.0    0.777778     0.5  \n",
      "14972                    0.0    0.222222     0.5  \n",
      "14973                    0.0    0.222222     0.5  \n",
      "14974                    0.0    0.222222     0.5  \n",
      "14975                    0.0    0.333333     0.5  \n",
      "14976                    0.0    0.333333     0.5  \n",
      "14977                    0.0    0.333333     0.5  \n",
      "14978                    0.0    0.333333     0.5  \n",
      "14979                    0.0    1.000000     0.5  \n",
      "14980                    0.0    1.000000     1.0  \n",
      "14981                    0.0    1.000000     0.0  \n",
      "14982                    0.0    1.000000     0.5  \n",
      "14983                    0.0    1.000000     0.5  \n",
      "14984                    0.0    1.000000     0.5  \n",
      "14985                    0.0    1.000000     0.5  \n",
      "14986                    0.0    1.000000     0.0  \n",
      "14987                    0.0    1.000000     0.0  \n",
      "14988                    0.0    1.000000     0.0  \n",
      "14989                    0.0    1.000000     0.0  \n",
      "14990                    0.0    0.888889     0.0  \n",
      "14991                    0.0    0.888889     0.0  \n",
      "14992                    0.0    0.888889     0.0  \n",
      "14993                    0.0    0.888889     0.0  \n",
      "14994                    0.0    0.888889     0.0  \n",
      "14995                    0.0    0.888889     0.0  \n",
      "14996                    0.0    0.888889     0.0  \n",
      "14997                    0.0    0.888889     0.0  \n",
      "14998                    0.0    0.888889     0.0  \n",
      "\n",
      "[14999 rows x 9 columns], 0        1\n",
      "1        1\n",
      "2        1\n",
      "3        1\n",
      "4        1\n",
      "5        1\n",
      "6        1\n",
      "7        1\n",
      "8        1\n",
      "9        1\n",
      "10       1\n",
      "11       1\n",
      "12       1\n",
      "13       1\n",
      "14       1\n",
      "15       1\n",
      "16       1\n",
      "17       1\n",
      "18       1\n",
      "19       1\n",
      "20       1\n",
      "21       1\n",
      "22       1\n",
      "23       1\n",
      "24       1\n",
      "25       1\n",
      "26       1\n",
      "27       1\n",
      "28       1\n",
      "29       1\n",
      "        ..\n",
      "14969    1\n",
      "14970    1\n",
      "14971    1\n",
      "14972    1\n",
      "14973    1\n",
      "14974    1\n",
      "14975    1\n",
      "14976    1\n",
      "14977    1\n",
      "14978    1\n",
      "14979    1\n",
      "14980    1\n",
      "14981    1\n",
      "14982    1\n",
      "14983    1\n",
      "14984    1\n",
      "14985    1\n",
      "14986    1\n",
      "14987    1\n",
      "14988    1\n",
      "14989    1\n",
      "14990    1\n",
      "14991    1\n",
      "14992    1\n",
      "14993    1\n",
      "14994    1\n",
      "14995    1\n",
      "14996    1\n",
      "14997    1\n",
      "14998    1\n",
      "Name: left, Length: 14999, dtype: int64)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/f/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:595: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.\n",
      "  warnings.warn(msg, DataConversionWarning)\n",
      "/home/f/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:595: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.\n",
      "  warnings.warn(msg, DataConversionWarning)\n",
      "/home/f/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:595: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.\n",
      "  warnings.warn(msg, DataConversionWarning)\n",
      "/home/f/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:595: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.\n",
      "  warnings.warn(msg, DataConversionWarning)\n",
      "/home/f/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:595: DataConversionWarning: Data with input dtype int64 was converted to float64 by MinMaxScaler.\n",
      "  warnings.warn(msg, DataConversionWarning)\n",
      "/home/f/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:595: DataConversionWarning: Data with input dtype int64 was converted to float64 by MinMaxScaler.\n",
      "  warnings.warn(msg, DataConversionWarning)\n",
      "/home/f/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:595: DataConversionWarning: Data with input dtype int64 was converted to float64 by MinMaxScaler.\n",
      "  warnings.warn(msg, DataConversionWarning)\n",
      "/home/f/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:595: DataConversionWarning: Data with input dtype int64 was converted to float64 by MinMaxScaler.\n",
      "  warnings.warn(msg, DataConversionWarning)\n",
      "/home/f/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/label.py:235: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
      "  y = column_or_1d(y, warn=True)\n",
      "/home/f/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:595: DataConversionWarning: Data with input dtype int64 was converted to float64 by MinMaxScaler.\n",
      "  warnings.warn(msg, DataConversionWarning)\n"
     ]
    }
   ],
   "source": [
    "df_res = hr_preprocessing(sl=True, le=True, npr=True, amh=True)\n",
    "print(df_res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
